################################################################################
# Fear and Favoritism in the Time of COVID-19
# Master Replication Script
#
# This script replicates all tables in the main paper and appendix
#
# Authors: Han, Ho, Rhee, Tabakis
# Last updated: 2024
################################################################################

# Clear workspace
rm(list = ls())

# ==============================================================================
# SETUP
# ==============================================================================

# Install required packages if not already installed
required_packages <- c("dplyr", "haven", "vtable", "tidyr", "broom", "gt",
                       "AER", "texreg", "fixest", "forcats", "kableExtra",
                       "mediation", "sandwich", "lmtest", "stargazer")

for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

# Set working directory to the replication package folder
# Users should modify this path to match their local setup

# Try to detect the script location automatically, otherwise use manual path
tryCatch({
  # For RStudio
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
  setwd("..")
}, error = function(e) {
  # For command line execution - modify this path as needed
  setwd("/Users/irhee/Dropbox/_doc/1_working/2_research/COVID_Replication/replication_package")
})

# Create output directories if they don't exist
dir.create("output/tables", recursive = TRUE, showWarnings = FALSE)
dir.create("output/figures", recursive = TRUE, showWarnings = FALSE)

# ==============================================================================
# LOAD AND PREPARE DATA
# ==============================================================================

cat("\n=== Loading Data ===\n")

# Load main survey data
df_main <- read.csv("data/survey_data.csv", encoding = "UTF-8")

# Load auxiliary datasets
gps_all <- read_dta("data/gps_measures.dta")
count_vars <- read_dta("data/media_counts.dta")
neighbor_vars <- read_dta("data/neighbor_preferences.dta")

# Standardize column names to lowercase
names(df_main) <- tolower(names(df_main))
names(gps_all) <- tolower(names(gps_all))
names(count_vars) <- tolower(names(count_vars))
names(neighbor_vars) <- tolower(names(neighbor_vars))

# Merge all datasets
df_main <- df_main %>%
  left_join(gps_all, by = "id") %>%
  left_join(count_vars, by = "id") %>%
  left_join(neighbor_vars, by = "id")

cat("Data loaded successfully. N =", nrow(df_main), "\n")

# ==============================================================================
# VARIABLE CREATION
# ==============================================================================

cat("\n=== Creating Variables ===\n")

# --- Outcome Variables ---

# Donation variables (already numeric in dataset)
# q77_n2 = Outgroup donation (Korea Support Center for Foreign Workers)
# q77_n3 = Ingroup donation (Korean Red Cross)
# q77_n4 = Total donation

df_main <- df_main %>%
  mutate(
    # Create in-group minus out-group difference
    out_in = q77_n2 - q77_n3,

    # Hyperlink click (behavioral measure) - main survey only
    # q110 coded: 예=clicked, 아니오=not clicked
    # Note: Paper uses only main survey q110 (N=4188), not combined with req110
    q110_ = case_when(
      q110 == "예" ~ 1,
      q110 == "아니오" ~ 0,
      TRUE ~ NA_real_
    )
  )

# --- Treatment Variable ---
df_main <- df_main %>%
  mutate(
    treat = ifelse(gubun %in% c("Group. Fear_1 : Q74 -> Q76 -> Q77",
                                 "Group. Fear_2 : Q76 -> Q74 -> Q77"), 1, 0)
  )

# --- Instrumental Variable (Reported Fear) ---
df_main <- df_main %>%
  mutate(
    iv = case_when(
      q81_0 == "많이 느낀다" ~ 4,
      q81_0 == "조금 느낀다" ~ 3,
      q81_0 == "아주 조금 느낀다" ~ 2,
      q81_0 == "전혀 느끼지 않는다" ~ 1,
      TRUE ~ NA_real_
    )
  )

# --- Emotion Variables (for manipulation check) ---
emotion_vars_raw <- c("q81_0", "q81_0_n2", "q81_0_n3", "q81_0_n4", "q81_0_n5", "q81_0_n6")

for (x in emotion_vars_raw) {
  new_var <- paste0(x, "_")
  df_main[[new_var]] <- case_when(
    df_main[[x]] == "많이 느낀다" ~ 4,
    df_main[[x]] == "조금 느낀다" ~ 3,
    df_main[[x]] == "아주 조금 느낀다" ~ 2,
    df_main[[x]] == "전혀 느끼지 않는다" ~ 1,
    TRUE ~ NA_real_
  )
}

# --- Demographic Variables ---
df_main <- df_main %>%
  mutate(
    sex = ifelse(sq1 == "남성", 1, 0),
    age = sq2,

    # Education categories
    edu = case_when(
      sq4 == "중학교 졸업 이하" ~ 1,
      sq4 == "고등학교 졸업" ~ 2,
      sq4 == "대학 재학 중" ~ 3,
      sq4 == "대학 졸업" ~ 4,
      sq4 == "대학원 재학 중" ~ 5,
      sq4 == "대학원 졸업" ~ 6
    ),

    # Religion categories
    rel = case_when(
      q68 == "개신교" ~ 1,
      q68 == "불교" ~ 2,
      q68 == "천주교" ~ 3,
      q68 == "종교 없음" ~ 4,
      q68 == "기타" ~ 5
    ),

    # Political ideology (1=Very liberal to 5=Very conservative)
    ideology = case_when(
      q66 == "매우 진보적" ~ 1,
      q66 == "다소 진보적" ~ 2,
      q66 == "중도" ~ 3,
      q66 == "다소 보수적" ~ 4,
      q66 == "매우 보수적" ~ 5,
      TRUE ~ NA_real_
    ),

    # Ruling party support
    q107 = trimws(q107),
    q108 = trimws(q108),
    vote_inc = ifelse(q107 == "예" & q108 == "더불어민주당", 1, 0),

    # Media consumption
    news_consumption = case_when(
      q102 == "전혀" ~ 1,
      q102 == "별로" ~ 2,
      q102 == "어느 정도" ~ 3,
      q102 == "매우 많이" ~ 4,
      TRUE ~ NA_real_
    ),

    sns_frequency = case_when(
      q104 == "전혀" ~ 1,
      q104 == "가끔" ~ 2,
      q104 == "종종" ~ 3,
      q104 == "매우 자주" ~ 4,
      TRUE ~ NA_real_
    )
  )

# --- KGSS Outcome Variables (want groups to decrease) ---
kgss_vars <- c("q82", "q82_n2", "q82_n3", "q82_n4", "q82_n5", "q82_n6")
for (x in kgss_vars) {
  df_main[[paste0(x, "_")]] <- case_when(
    df_main[[x]] == "매우 증가하기를 바람" ~ 1,
    df_main[[x]] == "다소 증가하기를 바람" ~ 2,
    df_main[[x]] == "변화 없기를 바람" ~ 3,
    df_main[[x]] == "다소 감소하기를 바람" ~ 4,
    df_main[[x]] == "매우 감소하기를 바람" ~ 5,
    df_main[[x]] == "모르겠다" ~ NA_real_,
    TRUE ~ NA_real_
  )
}

# Convert factor variables
df_main$sq3 <- as.factor(df_main$sq3)
df_main$sq4 <- as.factor(df_main$sq4)
df_main$q68 <- as.factor(df_main$q68)
df_main$q103 <- as.factor(df_main$q103)

cat("Variables created successfully.\n")

# ==============================================================================
# DEFINE CONTROL VARIABLE SETS
# ==============================================================================

# GPS preference controls
gps_controls <- c("gps_time", "gps_risk", "gps_alt", "gps_pr", "gps_nr", "gps_trust")

# Demographic controls
demo_controls <- c("sex", "age", "sq3", "sq4", "q68")

# Political controls
poli_controls <- c("ideology", "vote_inc")

# Media controls
media_controls <- c("news_consumption", "q103", "sns_frequency", "q105_count", "q106_count")

# Control sets for regression specifications
control_sets <- list(
  c1 = NULL,
  c2 = gps_controls,
  c3 = c(gps_controls, demo_controls),
  c4 = c(gps_controls, demo_controls, poli_controls),
  c5 = c(gps_controls, demo_controls, poli_controls, media_controls)
)

# ==============================================================================
# HELPER FUNCTIONS
# ==============================================================================

# Function to add significance stars
add_stars <- function(coef, pval) {
  stars <- case_when(
    pval < 0.01 ~ "***",
    pval < 0.05 ~ "**",
    pval < 0.1 ~ "*",
    TRUE ~ ""
  )
  paste0(round(coef, 3), stars)
}

# Function to run OLS, Reported Fear, and IV regressions
run_panel_regressions <- function(outcome, controls_list, data) {
  results <- list()

  for (i in seq_along(controls_list)) {
    ctrl <- controls_list[[i]]
    ctrl_formula <- if (is.null(ctrl)) "1" else paste(ctrl, collapse = " + ")

    # ITT (Intent to Treat) - Treatment effect
    itt_formula <- as.formula(paste(outcome, "~ treat +", ctrl_formula))
    itt_fit <- lm(itt_formula, data = data)

    # Reported Fear (OLS)
    rf_formula <- as.formula(paste(outcome, "~ iv +", ctrl_formula))
    rf_fit <- lm(rf_formula, data = data)

    # IV estimation (TOT - Treatment on Treated)
    iv_formula <- as.formula(paste(outcome, "~ iv +", ctrl_formula, "| treat +", ctrl_formula))
    iv_fit <- ivreg(iv_formula, data = data)

    results[[paste0("m", i)]] <- list(itt = itt_fit, rf = rf_fit, iv = iv_fit)
  }

  return(results)
}

# ==============================================================================
# TABLE 1: SUMMARY AND BALANCE CHECK
# ==============================================================================

cat("\n=== Generating Table 1: Summary and Balance Check ===\n")

# Create education and religion category labels
df_main <- df_main %>%
  mutate(
    edu_cat = factor(edu, levels = 1:6,
                     labels = c("Up to middle school", "High school (Graduated)",
                               "College (Enrolled)", "College (Graduated)",
                               "Graduate school (Enrolled)", "Graduate school (Graduated)")),
    rel_cat = factor(rel, levels = 1:5,
                     labels = c("Christianity", "Buddhism", "Catholicism",
                               "No Religion", "Etc.")),
    treatment_group = ifelse(treat == 1, "Fear Treatment", "Happiness Treatment")
  )

# Generate summary statistics by treatment group
summary_vars <- c("age", "sex", "ideology", "vote_inc",
                  "q77", "q77_n3", "q77_n2", "q77_n4", "q110_")

table1_data <- df_main %>%
  group_by(treatment_group) %>%
  summarise(
    across(all_of(summary_vars),
           list(N = ~sum(!is.na(.)),
                Mean = ~mean(., na.rm = TRUE),
                SD = ~sd(., na.rm = TRUE)),
           .names = "{.col}_{.fn}")
  )

write.csv(table1_data, "output/tables/table1_balance_check.csv", row.names = FALSE)

# Generate LaTeX version
table1_fear <- df_main %>% filter(treat == 1)
table1_happy <- df_main %>% filter(treat == 0)

table1_tex <- paste0(
  "\\begin{table}[H] \\normalsize\n",
  "\\centering \\singlespacing\n",
  "\\begin{adjustbox}{width=\\textwidth,center}\n",
  "\\begin{tabular}{l c c c c c c c c}\n",
  "\\hline\\hline\n",
  "& \\multicolumn{3}{c}{Fear Treatment} & & \\multicolumn{3}{c}{Happiness Treatment} \\\\\n",
  "& N & Mean & Std.Dev. & & N & Mean & Std.Dev.\\\\ \\hline\n",
  "\\\\\n",
  "\\multicolumn{8}{l}{Panel A : Demographic Characteristics} \\\\\n",
  "\\\\\n",
  sprintf("Age & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$age)), mean(table1_fear$age, na.rm=T), sd(table1_fear$age, na.rm=T),
          sum(!is.na(table1_happy$age)), mean(table1_happy$age, na.rm=T), sd(table1_happy$age, na.rm=T)),
  sprintf("Sex & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$sex)), mean(table1_fear$sex, na.rm=T), sd(table1_fear$sex, na.rm=T),
          sum(!is.na(table1_happy$sex)), mean(table1_happy$sex, na.rm=T), sd(table1_happy$sex, na.rm=T)),
  "\\\\\n",
  "\\multicolumn{8}{l}{Panel B : Political View} \\\\\n",
  "\\\\\n",
  sprintf("Ideology & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$ideology)), mean(table1_fear$ideology, na.rm=T), sd(table1_fear$ideology, na.rm=T),
          sum(!is.na(table1_happy$ideology)), mean(table1_happy$ideology, na.rm=T), sd(table1_happy$ideology, na.rm=T)),
  sprintf("Ruling Party Support & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$vote_inc)), mean(table1_fear$vote_inc, na.rm=T), sd(table1_fear$vote_inc, na.rm=T),
          sum(!is.na(table1_happy$vote_inc)), mean(table1_happy$vote_inc, na.rm=T), sd(table1_happy$vote_inc, na.rm=T)),
  "\\\\\n",
  "\\multicolumn{8}{l}{Panel C : Main Outcome Variables} \\\\\n",
  "\\\\\n",
  sprintf("Keep to Self & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$q77)), mean(table1_fear$q77, na.rm=T), sd(table1_fear$q77, na.rm=T),
          sum(!is.na(table1_happy$q77)), mean(table1_happy$q77, na.rm=T), sd(table1_happy$q77, na.rm=T)),
  sprintf("Ingroup Donation & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$q77_n3)), mean(table1_fear$q77_n3, na.rm=T), sd(table1_fear$q77_n3, na.rm=T),
          sum(!is.na(table1_happy$q77_n3)), mean(table1_happy$q77_n3, na.rm=T), sd(table1_happy$q77_n3, na.rm=T)),
  sprintf("Outgroup Donation & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$q77_n2)), mean(table1_fear$q77_n2, na.rm=T), sd(table1_fear$q77_n2, na.rm=T),
          sum(!is.na(table1_happy$q77_n2)), mean(table1_happy$q77_n2, na.rm=T), sd(table1_happy$q77_n2, na.rm=T)),
  sprintf("Total Donation & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$q77_n4)), mean(table1_fear$q77_n4, na.rm=T), sd(table1_fear$q77_n4, na.rm=T),
          sum(!is.na(table1_happy$q77_n4)), mean(table1_happy$q77_n4, na.rm=T), sd(table1_happy$q77_n4, na.rm=T)),
  sprintf("Clicked & %d & %.2f & %.2f && %d & %.2f & %.2f\\\\\n",
          sum(!is.na(table1_fear$q110_)), mean(table1_fear$q110_, na.rm=T), sd(table1_fear$q110_, na.rm=T),
          sum(!is.na(table1_happy$q110_)), mean(table1_happy$q110_, na.rm=T), sd(table1_happy$q110_, na.rm=T)),
  "\\hline\n",
  "\\end{tabular}\n",
  "\\end{adjustbox}\n",
  "\\caption{Summary and Balance Check} \\label{tab:table1_balance_check}\n",
  "\\end{table}\n"
)
writeLines(table1_tex, "output/tables/table1_balance_check.tex")

cat("Table 1 saved to output/tables/table1_balance_check.csv and .tex\n")

# ==============================================================================
# TABLE 2: EMOTIONS SUMMARY AND MANIPULATION CHECK
# ==============================================================================

cat("\n=== Generating Table 2: Emotions Summary and Manipulation Check ===\n")

emotion_vars <- c("q81_0_", "q81_0_n2_", "q81_0_n3_", "q81_0_n4_", "q81_0_n5_", "q81_0_n6_")
emotion_labels <- c("Fear", "Anger", "Happiness", "Sadness", "Disgust", "Surprise")

# Summary statistics
emotion_summary <- df_main %>%
  summarise(across(all_of(emotion_vars),
                   list(Mean = ~mean(., na.rm = TRUE),
                        SD = ~sd(., na.rm = TRUE),
                        N = ~sum(!is.na(.)))))

# Manipulation check regressions
manipulation_results <- data.frame(
  Emotion = emotion_labels,
  Coefficient = NA,
  SE = NA,
  pvalue = NA
)

for (i in seq_along(emotion_vars)) {
  reg <- lm(as.formula(paste(emotion_vars[i], "~ treat")), data = df_main)
  reg_summary <- summary(reg)$coefficients
  manipulation_results$Coefficient[i] <- reg_summary["treat", "Estimate"]
  manipulation_results$SE[i] <- reg_summary["treat", "Std. Error"]
  manipulation_results$pvalue[i] <- reg_summary["treat", "Pr(>|t|)"]
}

manipulation_results$Coefficient_stars <- mapply(add_stars,
                                                  manipulation_results$Coefficient,
                                                  manipulation_results$pvalue)

write.csv(manipulation_results, "output/tables/table2_manipulation_check.csv", row.names = FALSE)

# Generate LaTeX version for Table 2
table2_tex <- paste0(
  "\\begin{table}[H]\\singlespacing \\scriptsize\n",
  "\\centering\n",
  "\\begin{tabular}{lcccccc}\n",
  "\\hline\n",
  " & Mean & Std.Dev. & Manipulation Check \\\\\n",
  " & & & Coefficient (SE) \\\\\n",
  "\\hline\n"
)

for (i in seq_along(emotion_vars)) {
  overall_mean <- mean(df_main[[emotion_vars[i]]], na.rm = TRUE)
  overall_sd <- sd(df_main[[emotion_vars[i]]], na.rm = TRUE)
  coef_str <- sprintf("%.3f%s", manipulation_results$Coefficient[i],
                      ifelse(manipulation_results$pvalue[i] < 0.01, "***",
                             ifelse(manipulation_results$pvalue[i] < 0.05, "**",
                                    ifelse(manipulation_results$pvalue[i] < 0.1, "*", ""))))
  table2_tex <- paste0(table2_tex,
    sprintf("%s & %.3f & %.3f & %s (%.3f) \\\\\n",
            emotion_labels[i], overall_mean, overall_sd, coef_str, manipulation_results$SE[i]))
}

table2_tex <- paste0(table2_tex,
  "\\hline\n",
  "\\end{tabular}\n",
  "\\caption{Summary Statistics for Emotions and Manipulation Check} \\label{tab:table2_manipulation}\n",
  "\\end{table}\n"
)
writeLines(table2_tex, "output/tables/table2_manipulation_check.tex")

cat("Table 2 saved to output/tables/table2_manipulation_check.csv and .tex\n")

# ==============================================================================
# TABLE 3: MAIN RESULTS - EFFECTS OF TREATMENT ON DONATIONS
# ==============================================================================

cat("\n=== Generating Table 3: Main Results ===\n")

outcomes <- c("q77_n3", "q77_n2", "q77_n4", "q110_")
outcome_names <- c("Ingroup Donation", "Outgroup Donation", "Total Donation", "Seek Info")

# Run regressions for each outcome
main_results <- list()

for (j in seq_along(outcomes)) {
  outcome <- outcomes[j]
  panel_results <- run_panel_regressions(outcome, control_sets, df_main)

  # Extract key coefficients
  for (i in 1:length(control_sets)) {
    model_key <- paste0("m", i)

    # ITT
    itt_coef <- summary(panel_results[[model_key]]$itt)$coefficients["treat", ]

    # Reported Fear
    rf_coef <- summary(panel_results[[model_key]]$rf)$coefficients["iv", ]

    # IV
    iv_coef <- summary(panel_results[[model_key]]$iv)$coefficients["iv", ]

    main_results[[paste(outcome_names[j], "Model", i)]] <- data.frame(
      Outcome = outcome_names[j],
      Model = i,
      ITT_coef = itt_coef["Estimate"],
      ITT_se = itt_coef["Std. Error"],
      ITT_pval = itt_coef["Pr(>|t|)"],
      RF_coef = rf_coef["Estimate"],
      RF_se = rf_coef["Std. Error"],
      RF_pval = rf_coef["Pr(>|t|)"],
      IV_coef = iv_coef["Estimate"],
      IV_se = iv_coef["Std. Error"],
      IV_pval = iv_coef["Pr(>|t|)"]
    )
  }
}

main_results_df <- bind_rows(main_results)
write.csv(main_results_df, "output/tables/table3_main_results.csv", row.names = FALSE)

# Generate LaTeX version using stargazer for main specification (Model 5)
itt_models <- list()
rf_models <- list()
iv_models <- list()

for (j in seq_along(outcomes)) {
  outcome <- outcomes[j]
  ctrl <- control_sets[[5]]
  ctrl_formula <- paste(ctrl, collapse = " + ")

  itt_models[[j]] <- lm(as.formula(paste(outcome, "~ treat +", ctrl_formula)), data = df_main)
  rf_models[[j]] <- lm(as.formula(paste(outcome, "~ iv +", ctrl_formula)), data = df_main)
  iv_models[[j]] <- ivreg(as.formula(paste(outcome, "~ iv +", ctrl_formula, "| treat +", ctrl_formula)), data = df_main)
}

# ITT results
stargazer(itt_models[[1]], itt_models[[2]], itt_models[[3]], itt_models[[4]],
          type = "latex",
          out = "output/tables/table3_itt.tex",
          title = "Effects of Fear Treatment on Donations and Info-Seeking (ITT)",
          dep.var.labels = outcome_names,
          covariate.labels = c("Fear Treatment"),
          keep = "treat",
          omit.stat = c("f", "ser"),
          notes = "Standard errors in parentheses. * p<0.1, ** p<0.05, *** p<0.01",
          label = "tab:table3_itt")

# IV results
stargazer(iv_models[[1]], iv_models[[2]], iv_models[[3]], iv_models[[4]],
          type = "latex",
          out = "output/tables/table3_iv.tex",
          title = "Effects of Reported Fear on Donations and Info-Seeking (IV)",
          dep.var.labels = outcome_names,
          covariate.labels = c("Reported Fear (IV)"),
          keep = "iv",
          omit.stat = c("f", "ser"),
          notes = "Standard errors in parentheses. * p<0.1, ** p<0.05, *** p<0.01",
          label = "tab:table3_iv")

cat("Table 3 saved to output/tables/table3_main_results.csv, table3_itt.tex, table3_iv.tex\n")

# ==============================================================================
# TABLE 4: GPS VARIABLES AS OUTCOMES
# ==============================================================================

cat("\n=== Generating Table 4: GPS Variables as Outcomes ===\n")

gps_outcomes <- c("gps_alt", "gps_pr", "gps_nr", "gps_trust")
gps_outcome_names <- c("Altruism", "Positive Reciprocity", "Negative Reciprocity", "Trust")

gps_results <- list()

for (j in seq_along(gps_outcomes)) {
  outcome <- gps_outcomes[j]

  # No controls
  itt_m1 <- lm(as.formula(paste(outcome, "~ treat")), data = df_main)
  rf_m1 <- lm(as.formula(paste(outcome, "~ iv")), data = df_main)

  # Full controls (excluding GPS from controls)
  full_ctrl <- c(demo_controls, poli_controls, media_controls)
  ctrl_formula <- paste(full_ctrl, collapse = " + ")

  itt_m2 <- lm(as.formula(paste(outcome, "~ treat +", ctrl_formula)), data = df_main)
  rf_m2 <- lm(as.formula(paste(outcome, "~ iv +", ctrl_formula)), data = df_main)

  gps_results[[gps_outcome_names[j]]] <- data.frame(
    Outcome = gps_outcome_names[j],
    ITT_nocontrol = summary(itt_m1)$coefficients["treat", "Estimate"],
    ITT_nocontrol_se = summary(itt_m1)$coefficients["treat", "Std. Error"],
    ITT_full = summary(itt_m2)$coefficients["treat", "Estimate"],
    ITT_full_se = summary(itt_m2)$coefficients["treat", "Std. Error"],
    RF_nocontrol = summary(rf_m1)$coefficients["iv", "Estimate"],
    RF_nocontrol_se = summary(rf_m1)$coefficients["iv", "Std. Error"],
    RF_full = summary(rf_m2)$coefficients["iv", "Estimate"],
    RF_full_se = summary(rf_m2)$coefficients["iv", "Std. Error"]
  )
}

gps_results_df <- bind_rows(gps_results)
write.csv(gps_results_df, "output/tables/table4_gps_outcomes.csv", row.names = FALSE)
cat("Table 4 saved to output/tables/table4_gps_outcomes.csv\n")

# ==============================================================================
# TABLE 5: MEDIATION ANALYSIS
# ==============================================================================

cat("\n=== Generating Table 5: Mediation Analysis ===\n")

# Mediation analysis for altruism
med_fit <- lm(gps_alt ~ treat, data = df_main)
out_fit <- lm(q77_n2 ~ gps_alt + treat, data = df_main)

med_result <- tryCatch({
  mediate(med_fit, out_fit, treat = "treat", mediator = "gps_alt",
          boot = TRUE, sims = 1000)
}, error = function(e) {
  cat("Mediation analysis requires 'mediation' package. Skipping...\n")
  NULL
})

if (!is.null(med_result)) {
  mediation_summary <- data.frame(
    Mediator = "Altruism",
    ACME = med_result$d0,
    ACME_pval = med_result$d0.p,
    ADE = med_result$z0,
    ADE_pval = med_result$z0.p,
    Total = med_result$tau.coef,
    Total_pval = med_result$tau.p,
    Prop_Mediated = med_result$n0
  )
  write.csv(mediation_summary, "output/tables/table5_mediation.csv", row.names = FALSE)
  cat("Table 5 saved to output/tables/table5_mediation.csv\n")
}

# ==============================================================================
# TABLE 6: AEMT TEXT ANALYSIS (Placeholder - requires text classification)
# ==============================================================================

cat("\n=== Table 6: AEMT Text Analysis ===\n")
cat("Note: Table 6 requires text classification of AEMT responses.\n")
cat("This analysis uses supervised machine learning (RTextTools) to classify\n")
cat("responses into Health, Economic, and Other categories.\n")

# ==============================================================================
# APPENDIX TABLES
# ==============================================================================

cat("\n=== Generating Appendix Tables ===\n")

# --- Table A1: Main Results with First 4,000 Observations ---
cat("Generating Table A1: First 4,000 observations...\n")

df_first4000 <- df_main %>% slice(1:4000)
a1_results <- run_panel_regressions("q77_n2", control_sets, df_first4000)

# Extract and save results
a1_summary <- data.frame(
  Model = 1:5,
  ITT_coef = sapply(1:5, function(i) summary(a1_results[[paste0("m",i)]]$itt)$coefficients["treat", "Estimate"]),
  ITT_se = sapply(1:5, function(i) summary(a1_results[[paste0("m",i)]]$itt)$coefficients["treat", "Std. Error"])
)
write.csv(a1_summary, "output/tables/tableA1_first4000.csv", row.names = FALSE)

# --- Table A3: Media Treatment Effects ---
cat("Generating Table A3: Media treatment effects...\n")

# Create media treatment variable from gubun3 column
# gubun3 contains: 한국인 (Korean), 조선족 (Korean-Chinese), 중국인 (Chinese)
df_main <- df_main %>%
  mutate(
    media_korean = ifelse(gubun3 == "한국인", 1, 0),
    media_korean_chinese = ifelse(gubun3 == "조선족", 1, 0),
    media_chinese = ifelse(gubun3 == "중국인", 1, 0)
  )

# Run regressions with media treatment (with and without controls)
media_outcomes <- c("q77_n3", "q77_n2", "q77_n4", "q110_")
media_results <- list()

for (outcome in media_outcomes) {
  # Model with fear treatment and media treatment
  fit1 <- lm(as.formula(paste(outcome, "~ treat")), data = df_main)
  fit2 <- lm(as.formula(paste(outcome, "~ media_korean + media_korean_chinese")), data = df_main)
  fit3 <- lm(as.formula(paste(outcome, "~ treat + media_korean + media_korean_chinese + treat:media_korean + treat:media_korean_chinese")),
            data = df_main)

  media_results[[paste(outcome, "fear_only")]] <- tidy(fit1) %>% mutate(outcome = outcome, model = "fear_only")
  media_results[[paste(outcome, "media_only")]] <- tidy(fit2) %>% mutate(outcome = outcome, model = "media_only")
  media_results[[paste(outcome, "interaction")]] <- tidy(fit3) %>% mutate(outcome = outcome, model = "interaction")
}

write.csv(bind_rows(media_results),
          "output/tables/tableA3_media_treatment.csv", row.names = FALSE)

# --- Table A4: Political Interactions ---
cat("Generating Table A4: Political interactions...\n")

poli_int_results <- list()
for (outcome in c("q77_n2", "q110_")) {
  # Ideology interaction
  fit1 <- lm(as.formula(paste(outcome, "~ treat * ideology")), data = df_main)
  fit2 <- lm(as.formula(paste(outcome, "~ treat * vote_inc")), data = df_main)

  poli_int_results[[paste(outcome, "ideology")]] <- tidy(fit1)
  poli_int_results[[paste(outcome, "vote_inc")]] <- tidy(fit2)
}

write.csv(bind_rows(poli_int_results, .id = "model"),
          "output/tables/tableA4_political_interactions.csv", row.names = FALSE)

# --- Table A5: Media Interactions ---
cat("Generating Table A5: Media interactions...\n")

media_int_results <- list()
for (outcome in c("q77_n2", "q110_")) {
  fit <- lm(as.formula(paste(outcome, "~ treat * news_consumption")), data = df_main)
  media_int_results[[outcome]] <- tidy(fit)
}

write.csv(bind_rows(media_int_results, .id = "outcome"),
          "output/tables/tableA5_media_interactions.csv", row.names = FALSE)

# --- Table A6: KGSS Outcomes ---
cat("Generating Table A6: KGSS outcomes...\n")

kgss_outcome_vars <- c("q82_", "q82_n2_", "q82_n3_", "q82_n4_", "q82_n5_", "q82_n6_")
kgss_labels <- c("North Korean Defectors", "Foreign Production Workers",
                 "Foreign Professional Workers", "Korean Chinese",
                 "Foreign Students", "Foreign Businessmen")

kgss_results <- list()
for (i in seq_along(kgss_outcome_vars)) {
  outcome <- kgss_outcome_vars[i]

  # ITT
  itt_fit <- lm(as.formula(paste(outcome, "~ treat")), data = df_main)
  rf_fit <- lm(as.formula(paste(outcome, "~ iv")), data = df_main)

  kgss_results[[kgss_labels[i]]] <- data.frame(
    Outcome = kgss_labels[i],
    ITT_coef = summary(itt_fit)$coefficients["treat", "Estimate"],
    ITT_se = summary(itt_fit)$coefficients["treat", "Std. Error"],
    ITT_pval = summary(itt_fit)$coefficients["treat", "Pr(>|t|)"],
    RF_coef = summary(rf_fit)$coefficients["iv", "Estimate"],
    RF_se = summary(rf_fit)$coefficients["iv", "Std. Error"],
    RF_pval = summary(rf_fit)$coefficients["iv", "Pr(>|t|)"]
  )
}

write.csv(bind_rows(kgss_results), "output/tables/tableA6_kgss.csv", row.names = FALSE)

# --- Table A7: WVS Outcomes ---
cat("Generating Table A7: WVS outcomes...\n")

wvs_outcome_vars <- c("q94_1", "q94_2", "q94_3", "q94_4", "q94_5", "q94_6", "q94_7")
wvs_labels <- c("Persons With Disabilities", "Foreign Immigrants/Workers",
                "Members Of A Cult", "Former Convicts", "Sexual Minorities",
                "North Korean Defectors", "Refugees")

wvs_results <- list()
for (i in seq_along(wvs_outcome_vars)) {
  outcome <- wvs_outcome_vars[i]

  if (outcome %in% names(df_main)) {
    itt_fit <- lm(as.formula(paste(outcome, "~ treat")), data = df_main)
    rf_fit <- lm(as.formula(paste(outcome, "~ iv")), data = df_main)

    wvs_results[[wvs_labels[i]]] <- data.frame(
      Outcome = wvs_labels[i],
      ITT_coef = summary(itt_fit)$coefficients["treat", "Estimate"],
      ITT_se = summary(itt_fit)$coefficients["treat", "Std. Error"],
      RF_coef = summary(rf_fit)$coefficients["iv", "Estimate"],
      RF_se = summary(rf_fit)$coefficients["iv", "Std. Error"]
    )
  }
}

if (length(wvs_results) > 0) {
  write.csv(bind_rows(wvs_results), "output/tables/tableA7_wvs.csv", row.names = FALSE)
}

# --- Table A9: T-Test Results for GPS Variables ---
cat("Generating Table A9: T-test results...\n")

gps_ttest_vars <- c("gps_alt", "gps_pr", "gps_nr", "gps_trust", "gps_time", "gps_risk")
gps_ttest_labels <- c("Altruism", "Positive Reciprocity", "Negative Reciprocity",
                       "Trust", "Time Preference", "Risk Preference")

ttest_results <- list()
for (i in seq_along(gps_ttest_vars)) {
  var <- gps_ttest_vars[i]

  control_vals <- df_main[[var]][df_main$treat == 0]
  treat_vals <- df_main[[var]][df_main$treat == 1]

  ttest <- t.test(treat_vals, control_vals)

  ttest_results[[gps_ttest_labels[i]]] <- data.frame(
    Variable = gps_ttest_labels[i],
    Control_Mean = mean(control_vals, na.rm = TRUE),
    Treatment_Mean = mean(treat_vals, na.rm = TRUE),
    Mean_Diff = ttest$estimate[1] - ttest$estimate[2],
    t_value = ttest$statistic,
    p_value = ttest$p.value,
    CI_lower = ttest$conf.int[1],
    CI_upper = ttest$conf.int[2]
  )
}

write.csv(bind_rows(ttest_results), "output/tables/tableA9_ttest.csv", row.names = FALSE)

# --- Table A2: Benjamini-Hochberg Corrected P-values ---
cat("Generating Table A2: BH corrected p-values...\n")

# Run all 5 specifications for outgroup donation and collect p-values
bh_results <- data.frame(
  Model = 1:5,
  ITT_coef = NA, ITT_se = NA, ITT_pval = NA, ITT_pval_bh = NA,
  RF_coef = NA, RF_se = NA, RF_pval = NA, RF_pval_bh = NA,
  IV_coef = NA, IV_se = NA, IV_pval = NA, IV_pval_bh = NA
)

for (i in 1:5) {
  ctrl <- control_sets[[i]]
  ctrl_formula <- if (is.null(ctrl)) "1" else paste(ctrl, collapse = " + ")

  itt_fit <- lm(as.formula(paste("q77_n2 ~ treat +", ctrl_formula)), data = df_main)
  rf_fit <- lm(as.formula(paste("q77_n2 ~ iv +", ctrl_formula)), data = df_main)
  iv_fit <- ivreg(as.formula(paste("q77_n2 ~ iv +", ctrl_formula, "| treat +", ctrl_formula)), data = df_main)

  bh_results$ITT_coef[i] <- summary(itt_fit)$coefficients["treat", "Estimate"]
  bh_results$ITT_se[i] <- summary(itt_fit)$coefficients["treat", "Std. Error"]
  bh_results$ITT_pval[i] <- summary(itt_fit)$coefficients["treat", "Pr(>|t|)"]

  bh_results$RF_coef[i] <- summary(rf_fit)$coefficients["iv", "Estimate"]
  bh_results$RF_se[i] <- summary(rf_fit)$coefficients["iv", "Std. Error"]
  bh_results$RF_pval[i] <- summary(rf_fit)$coefficients["iv", "Pr(>|t|)"]

  bh_results$IV_coef[i] <- summary(iv_fit)$coefficients["iv", "Estimate"]
  bh_results$IV_se[i] <- summary(iv_fit)$coefficients["iv", "Std. Error"]
  bh_results$IV_pval[i] <- summary(iv_fit)$coefficients["iv", "Pr(>|t|)"]
}

# Apply Benjamini-Hochberg correction
bh_results$ITT_pval_bh <- p.adjust(bh_results$ITT_pval, method = "BH")
bh_results$RF_pval_bh <- p.adjust(bh_results$RF_pval, method = "BH")
bh_results$IV_pval_bh <- p.adjust(bh_results$IV_pval, method = "BH")

write.csv(bh_results, "output/tables/tableA2_bh_corrected.csv", row.names = FALSE)

# --- Table A8: Global Values IV Analysis ---
cat("Generating Table A8: Global values IV...\n")

gps_iv_vars <- c("gps_alt", "gps_pr", "gps_nr", "gps_trust")
gps_iv_labels <- c("Altruism", "Positive Reciprocity", "Negative Reciprocity", "Trust")

gps_iv_results <- list()
for (i in seq_along(gps_iv_vars)) {
  outcome <- gps_iv_vars[i]

  # IV regression: GPS variable as outcome, treat as instrument for iv
  iv_fit <- ivreg(as.formula(paste(outcome, "~ iv | treat")), data = df_main)

  gps_iv_results[[gps_iv_labels[i]]] <- data.frame(
    Outcome = gps_iv_labels[i],
    IV_coef = summary(iv_fit)$coefficients["iv", "Estimate"],
    IV_se = summary(iv_fit)$coefficients["iv", "Std. Error"],
    IV_pval = summary(iv_fit)$coefficients["iv", "Pr(>|t|)"],
    R2 = summary(iv_fit)$r.squared
  )
}

write.csv(bind_rows(gps_iv_results), "output/tables/tableA8_gps_iv.csv", row.names = FALSE)

# ==============================================================================
# FIGURE A1: WORD CLOUD GENERATION
# ==============================================================================

cat("\n=== Figure A1: Word Cloud Generation ===\n")

# The word clouds require:
# 1. Pre-translated text responses (from Google Translate API)
# 2. Cause classifications (Economic, Health, Other)
#
# Due to Google Translate API licensing, the translated data is not included.
# Pre-generated figures are provided in output/figures/

translated_file <- "data/fear_translated.csv"

if (file.exists(translated_file)) {

  # Check if quanteda packages are available
  if (require(quanteda) && require(quanteda.textplots)) {

    cat("Loading translated text data...\n")
    df_text <- read.csv(translated_file, stringsAsFactors = FALSE)

    # Check required columns
    if (all(c("fear_traslated", "cause") %in% names(df_text))) {

      # -------------------------------------------------------------------------
      # Figure A1a: Pooled Word Cloud (All Fear Text)
      # -------------------------------------------------------------------------

      corp_fear <- corpus(df_text$fear_traslated,
                          docvars = data.frame(cause = df_text$cause))

      dfm_fear <- dfm(corp_fear,
                      remove = stopwords("english"),
                      stem = TRUE,
                      remove_punct = TRUE)

      png("output/figures/Word Cloud of the Pooled AMET (Fear).png",
          width = 800, height = 600, res = 100)

      textplot_wordcloud(dfm_fear,
                         min_count = 6,
                         random_order = FALSE,
                         rotation = 0.25,
                         color = RColorBrewer::brewer.pal(8, "Dark2"))

      dev.off()
      cat("Generated: Word Cloud of the Pooled AMET (Fear).png\n")

      # -------------------------------------------------------------------------
      # Figure A1b: Comparison Word Cloud by Cause
      # -------------------------------------------------------------------------

      df_classified <- df_text %>%
        filter(cause %in% c("Economic", "Health", "Other"))

      if (nrow(df_classified) > 0) {

        corp_classified <- corpus(df_classified$fear_traslated,
                                  docvars = data.frame(cause = df_classified$cause))

        png("output/figures/Word Cloud by Economics, Health, and Others.png",
            width = 1200, height = 600, res = 100)

        corp_classified %>%
          dfm(groups = "cause",
              remove = c(stopwords("english"), "none"),
              stem = TRUE,
              remove_punct = TRUE) %>%
          dfm_trim(min_termfreq = 5, verbose = FALSE) %>%
          textplot_wordcloud(comparison = TRUE, labelsize = 1)

        dev.off()
        cat("Generated: Word Cloud by Economics, Health, and Others.png\n")
      }

    } else {
      cat("Note: Translated data missing required columns (fear_traslated, cause).\n")
    }

  } else {
    cat("Note: quanteda/quanteda.textplots packages not installed.\n")
    cat("To generate word clouds, install: install.packages(c('quanteda', 'quanteda.textplots'))\n")
  }

} else {
  cat("Note: Translated text data (fear_translated.csv) not found.\n")
  cat("Pre-generated word cloud figures are available in output/figures/\n")
  cat("  - Word Cloud of the Pooled AMET (Fear).png\n")
  cat("  - Word Cloud by Economics, Health, and Others.png\n")
}

# ==============================================================================
# COMPLETION
# ==============================================================================

cat("\n")
cat("================================================================================\n")
cat("REPLICATION COMPLETE\n")
cat("================================================================================\n")
cat("\nAll tables have been saved to the 'output/tables/' directory.\n")
cat("Figures are available in the 'output/figures/' directory.\n")
cat("\nGenerated files:\n")
cat("\nTables:\n")
print(list.files("output/tables/", pattern = "*.csv"))
cat("\nFigures:\n")
print(list.files("output/figures/", pattern = "*.png"))
